In [1]:
    
import requests
from lxml import html
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from pprint import pprint
import re
%matplotlib inline
    
In [2]:
    
url = "https://www.washingtonpost.com/news/the-fix/wp/2016/01/17/the-4th-democratic-debate-transcript-annotated-who-said-what-and-what-it-meant/"
response = requests.get(url)
    
In [3]:
    
doc = html.fromstring(response.text)
    
In [4]:
    
para_list = doc.xpath("//article/p/text()")
    
In [5]:
    
para_list = para_list[2:]
    
In [6]:
    
pprint(para_list[:2], compact=True)
print(para_list[-2:])
    
    
["HOLT: We'll begin with 45 second opening statements from each candidate, "
 'starting with Secretary Clinton. ',
 'CLINTON: Well, good evening. And I want to thank the Congressional Black '
 'Caucus Institute and the people of Charleston for hosting us here on the '
 'eve of Martin Luther King Day tomorrow. ']
['HOLT: All right. Well thank you and thanks to all of you for being here tonight shedding light on some of the differences as Americans get ready to vote. ', "I also want to thank the Congressional Black Caucus Institute and certainly my friend and colleague, Andrea Mitchell. This has been great. It's been a great spirited conversation and American people appreciate it. "]
In [7]:
    
dataset = pd.DataFrame(para_list, columns=["raw"])
dataset
    
    Out[7]:
  
    
       
      raw 
     
  
  
    
      0 
      HOLT: We'll begin with 45 second opening state... 
     
    
      1 
      CLINTON: Well, good evening. And I want to tha... 
     
    
      2 
      You know, I remember well when my youth minist... 
     
    
      3 
      And that is our fight still. We have to get th... 
     
    
      4 
      I understand that this is the hardest job in t... 
     
    
      5 
      (APPLAUSE) 
     
    
      6 
      HOLT: Thank you. Senator Sanders, your opening... 
     
    
      7 
      SANDERS: Thank you. As we honor the extraordin... 
     
    
      8 
      SANDERS: And then, to make a bad situation wor... 
     
    
      9 
      This campaign is about a political revolution ... 
     
    
      10 
      HOLT: Senator, thank you. 
     
    
      11 
      (APPLAUSE) 
     
    
      12 
      And Governor O'Malley, your opening statement,... 
     
    
      13 
      O'MALLEY: Thank you. My name is Martin O'Malle... 
     
    
      14 
      And I want to thank the people of South Caroli... 
     
    
      15 
      You taught us, in fact, in keeping with Dr. Ki... 
     
    
      16 
      Eight years ago, you brought forward a new lea... 
     
    
      17 
      But in order to make good on the promise of eq... 
     
    
      18 
      We need new leadership. We need to come togeth... 
     
    
      19 
      That's why I'm running for president. I need y... 
     
    
      20 
      Thank you. 
     
    
      21 
      HOLT: All right. And Governor, thank you. 
     
    
      22 
      (APPLAUSE) 
     
    
      23 
      HOLT: All right, to our first question, now. T... 
     
    
      24 
      President Obama came to office determined to s... 
     
    
      25 
      Senator Sanders. 
     
    
      26 
      SANDERS: Well, that's what our campaign is abo... 
     
    
      27 
      So, what my first days are about is bringing A... 
     
    
      28 
      (APPLAUSE) 
     
    
      29 
      HOLT: Secretary Clinton, same question, my fir... 
     
    
      ... 
      ... 
     
    
      592 
      HOLT: Welcome back everybody. Finally, before ... 
     
    
      593 
      And, we'll start with Governor O'Malley. 
     
    
      594 
      (LAUGHTER) 
     
    
      595 
      HOLT: Didn't see that coming, did you? 
     
    
      596 
      O'MALLEY: Yes, but we're going to have to get ... 
     
    
      597 
      (LAUGHTER) 
     
    
      598 
      MITCHELL: ...too long (ph). 
     
    
      599 
      O'MALLEY: I believe there are many issues. I h... 
     
    
      600 
      HOLT: Sixty seconds, we'd appreciate it. 
     
    
      601 
      O'MALLEY: There are so many issues that we hav... 
     
    
      602 
      (APPLAUSE) 
     
    
      603 
      O'MALLEY: We haven't discussed the fact that i... 
     
    
      604 
      I guess the bottom line is this, look we are a... 
     
    
      605 
      We're on the threshold of a new era of America... 
     
    
      606 
      HOLT: And that's time. 
     
    
      607 
      O'MALLEY: Thanks a lot. 
     
    
      608 
      HOLT: Secretary Clinton? 
     
    
      609 
      CLINTON: Well Lester, I spent a lot of time la... 
     
    
      610 
      He had request for help and he had basically s... 
     
    
      611 
      So I sent my top campaign aide down there to t... 
     
    
      612 
      HOLT: And that's time. 
     
    
      613 
      CLINTON: I want to be a president who takes ca... 
     
    
      614 
      (APPLAUSE) 
     
    
      615 
      HOLT: Thank you. 
     
    
      616 
      Senator Sanders? 
     
    
      617 
      SANDERS: Well, Secretary Clinton was right and... 
     
    
      618 
      Now, we are a great nation -- and we've heard ... 
     
    
      619 
      We've got to get rid of Super PACs, we've got ... 
     
    
      620 
      HOLT: All right. Well thank you and thanks to ... 
     
    
      621 
      I also want to thank the Congressional Black C... 
     
  
622 rows × 1 columns
In [8]:
    
def get_name(x):
    r = re.findall(r"^([A-Z']*):", x)
    if r:
        return r[0]
    else:
        return np.NaN
    
In [9]:
    
dataset["speaker"] = dataset.raw.apply(get_name).fillna(method='ffill')
dataset
    
    Out[9]:
  
    
       
      raw 
      speaker 
     
  
  
    
      0 
      HOLT: We'll begin with 45 second opening state... 
      HOLT 
     
    
      1 
      CLINTON: Well, good evening. And I want to tha... 
      CLINTON 
     
    
      2 
      You know, I remember well when my youth minist... 
      CLINTON 
     
    
      3 
      And that is our fight still. We have to get th... 
      CLINTON 
     
    
      4 
      I understand that this is the hardest job in t... 
      CLINTON 
     
    
      5 
      (APPLAUSE) 
      CLINTON 
     
    
      6 
      HOLT: Thank you. Senator Sanders, your opening... 
      HOLT 
     
    
      7 
      SANDERS: Thank you. As we honor the extraordin... 
      SANDERS 
     
    
      8 
      SANDERS: And then, to make a bad situation wor... 
      SANDERS 
     
    
      9 
      This campaign is about a political revolution ... 
      SANDERS 
     
    
      10 
      HOLT: Senator, thank you. 
      HOLT 
     
    
      11 
      (APPLAUSE) 
      HOLT 
     
    
      12 
      And Governor O'Malley, your opening statement,... 
      HOLT 
     
    
      13 
      O'MALLEY: Thank you. My name is Martin O'Malle... 
      O'MALLEY 
     
    
      14 
      And I want to thank the people of South Caroli... 
      O'MALLEY 
     
    
      15 
      You taught us, in fact, in keeping with Dr. Ki... 
      O'MALLEY 
     
    
      16 
      Eight years ago, you brought forward a new lea... 
      O'MALLEY 
     
    
      17 
      But in order to make good on the promise of eq... 
      O'MALLEY 
     
    
      18 
      We need new leadership. We need to come togeth... 
      O'MALLEY 
     
    
      19 
      That's why I'm running for president. I need y... 
      O'MALLEY 
     
    
      20 
      Thank you. 
      O'MALLEY 
     
    
      21 
      HOLT: All right. And Governor, thank you. 
      HOLT 
     
    
      22 
      (APPLAUSE) 
      HOLT 
     
    
      23 
      HOLT: All right, to our first question, now. T... 
      HOLT 
     
    
      24 
      President Obama came to office determined to s... 
      HOLT 
     
    
      25 
      Senator Sanders. 
      HOLT 
     
    
      26 
      SANDERS: Well, that's what our campaign is abo... 
      SANDERS 
     
    
      27 
      So, what my first days are about is bringing A... 
      SANDERS 
     
    
      28 
      (APPLAUSE) 
      SANDERS 
     
    
      29 
      HOLT: Secretary Clinton, same question, my fir... 
      HOLT 
     
    
      ... 
      ... 
      ... 
     
    
      592 
      HOLT: Welcome back everybody. Finally, before ... 
      HOLT 
     
    
      593 
      And, we'll start with Governor O'Malley. 
      HOLT 
     
    
      594 
      (LAUGHTER) 
      HOLT 
     
    
      595 
      HOLT: Didn't see that coming, did you? 
      HOLT 
     
    
      596 
      O'MALLEY: Yes, but we're going to have to get ... 
      O'MALLEY 
     
    
      597 
      (LAUGHTER) 
      O'MALLEY 
     
    
      598 
      MITCHELL: ...too long (ph). 
      MITCHELL 
     
    
      599 
      O'MALLEY: I believe there are many issues. I h... 
      O'MALLEY 
     
    
      600 
      HOLT: Sixty seconds, we'd appreciate it. 
      HOLT 
     
    
      601 
      O'MALLEY: There are so many issues that we hav... 
      O'MALLEY 
     
    
      602 
      (APPLAUSE) 
      O'MALLEY 
     
    
      603 
      O'MALLEY: We haven't discussed the fact that i... 
      O'MALLEY 
     
    
      604 
      I guess the bottom line is this, look we are a... 
      O'MALLEY 
     
    
      605 
      We're on the threshold of a new era of America... 
      O'MALLEY 
     
    
      606 
      HOLT: And that's time. 
      HOLT 
     
    
      607 
      O'MALLEY: Thanks a lot. 
      O'MALLEY 
     
    
      608 
      HOLT: Secretary Clinton? 
      HOLT 
     
    
      609 
      CLINTON: Well Lester, I spent a lot of time la... 
      CLINTON 
     
    
      610 
      He had request for help and he had basically s... 
      CLINTON 
     
    
      611 
      So I sent my top campaign aide down there to t... 
      CLINTON 
     
    
      612 
      HOLT: And that's time. 
      HOLT 
     
    
      613 
      CLINTON: I want to be a president who takes ca... 
      CLINTON 
     
    
      614 
      (APPLAUSE) 
      CLINTON 
     
    
      615 
      HOLT: Thank you. 
      HOLT 
     
    
      616 
      Senator Sanders? 
      HOLT 
     
    
      617 
      SANDERS: Well, Secretary Clinton was right and... 
      SANDERS 
     
    
      618 
      Now, we are a great nation -- and we've heard ... 
      SANDERS 
     
    
      619 
      We've got to get rid of Super PACs, we've got ... 
      SANDERS 
     
    
      620 
      HOLT: All right. Well thank you and thanks to ... 
      HOLT 
     
    
      621 
      I also want to thank the Congressional Black C... 
      HOLT 
     
  
622 rows × 2 columns
In [10]:
    
dataset.speaker.value_counts()
    
    Out[10]:
SANDERS     168
HOLT        152
CLINTON     131
O'MALLEY    113
MITCHELL     43
TODD          7
BROWNLEE      4
FRANTA        2
MILLER        2
Name: speaker, dtype: int64
In [11]:
    
get_speach = lambda x: re.sub("^[A-Z']*:\s", "", x)
dataset["speach"] = dataset.raw.apply(get_speach)
dataset
    
    Out[11]:
  
    
       
      raw 
      speaker 
      speach 
     
  
  
    
      0 
      HOLT: We'll begin with 45 second opening state... 
      HOLT 
      We'll begin with 45 second opening statements ... 
     
    
      1 
      CLINTON: Well, good evening. And I want to tha... 
      CLINTON 
      Well, good evening. And I want to thank the Co... 
     
    
      2 
      You know, I remember well when my youth minist... 
      CLINTON 
      You know, I remember well when my youth minist... 
     
    
      3 
      And that is our fight still. We have to get th... 
      CLINTON 
      And that is our fight still. We have to get th... 
     
    
      4 
      I understand that this is the hardest job in t... 
      CLINTON 
      I understand that this is the hardest job in t... 
     
    
      5 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
     
    
      6 
      HOLT: Thank you. Senator Sanders, your opening... 
      HOLT 
      Thank you. Senator Sanders, your opening state... 
     
    
      7 
      SANDERS: Thank you. As we honor the extraordin... 
      SANDERS 
      Thank you. As we honor the extraordinary life ... 
     
    
      8 
      SANDERS: And then, to make a bad situation wor... 
      SANDERS 
      And then, to make a bad situation worse, we ha... 
     
    
      9 
      This campaign is about a political revolution ... 
      SANDERS 
      This campaign is about a political revolution ... 
     
    
      10 
      HOLT: Senator, thank you. 
      HOLT 
      Senator, thank you. 
     
    
      11 
      (APPLAUSE) 
      HOLT 
      (APPLAUSE) 
     
    
      12 
      And Governor O'Malley, your opening statement,... 
      HOLT 
      And Governor O'Malley, your opening statement,... 
     
    
      13 
      O'MALLEY: Thank you. My name is Martin O'Malle... 
      O'MALLEY 
      Thank you. My name is Martin O'Malley, I was b... 
     
    
      14 
      And I want to thank the people of South Caroli... 
      O'MALLEY 
      And I want to thank the people of South Caroli... 
     
    
      15 
      You taught us, in fact, in keeping with Dr. Ki... 
      O'MALLEY 
      You taught us, in fact, in keeping with Dr. Ki... 
     
    
      16 
      Eight years ago, you brought forward a new lea... 
      O'MALLEY 
      Eight years ago, you brought forward a new lea... 
     
    
      17 
      But in order to make good on the promise of eq... 
      O'MALLEY 
      But in order to make good on the promise of eq... 
     
    
      18 
      We need new leadership. We need to come togeth... 
      O'MALLEY 
      We need new leadership. We need to come togeth... 
     
    
      19 
      That's why I'm running for president. I need y... 
      O'MALLEY 
      That's why I'm running for president. I need y... 
     
    
      20 
      Thank you. 
      O'MALLEY 
      Thank you. 
     
    
      21 
      HOLT: All right. And Governor, thank you. 
      HOLT 
      All right. And Governor, thank you. 
     
    
      22 
      (APPLAUSE) 
      HOLT 
      (APPLAUSE) 
     
    
      23 
      HOLT: All right, to our first question, now. T... 
      HOLT 
      All right, to our first question, now. The fir... 
     
    
      24 
      President Obama came to office determined to s... 
      HOLT 
      President Obama came to office determined to s... 
     
    
      25 
      Senator Sanders. 
      HOLT 
      Senator Sanders. 
     
    
      26 
      SANDERS: Well, that's what our campaign is abo... 
      SANDERS 
      Well, that's what our campaign is about. It is... 
     
    
      27 
      So, what my first days are about is bringing A... 
      SANDERS 
      So, what my first days are about is bringing A... 
     
    
      28 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
     
    
      29 
      HOLT: Secretary Clinton, same question, my fir... 
      HOLT 
      Secretary Clinton, same question, my first 100... 
     
    
      ... 
      ... 
      ... 
      ... 
     
    
      592 
      HOLT: Welcome back everybody. Finally, before ... 
      HOLT 
      Welcome back everybody. Finally, before we go ... 
     
    
      593 
      And, we'll start with Governor O'Malley. 
      HOLT 
      And, we'll start with Governor O'Malley. 
     
    
      594 
      (LAUGHTER) 
      HOLT 
      (LAUGHTER) 
     
    
      595 
      HOLT: Didn't see that coming, did you? 
      HOLT 
      Didn't see that coming, did you? 
     
    
      596 
      O'MALLEY: Yes, but we're going to have to get ... 
      O'MALLEY 
      Yes, but we're going to have to get 20 minutes... 
     
    
      597 
      (LAUGHTER) 
      O'MALLEY 
      (LAUGHTER) 
     
    
      598 
      MITCHELL: ...too long (ph). 
      MITCHELL 
      ...too long (ph). 
     
    
      599 
      O'MALLEY: I believe there are many issues. I h... 
      O'MALLEY 
      I believe there are many issues. I have 60 sec... 
     
    
      600 
      HOLT: Sixty seconds, we'd appreciate it. 
      HOLT 
      Sixty seconds, we'd appreciate it. 
     
    
      601 
      O'MALLEY: There are so many issues that we hav... 
      O'MALLEY 
      There are so many issues that we haven't been ... 
     
    
      602 
      (APPLAUSE) 
      O'MALLEY 
      (APPLAUSE) 
     
    
      603 
      O'MALLEY: We haven't discussed the fact that i... 
      O'MALLEY 
      We haven't discussed the fact that in our hemi... 
     
    
      604 
      I guess the bottom line is this, look we are a... 
      O'MALLEY 
      I guess the bottom line is this, look we are a... 
     
    
      605 
      We're on the threshold of a new era of America... 
      O'MALLEY 
      We're on the threshold of a new era of America... 
     
    
      606 
      HOLT: And that's time. 
      HOLT 
      And that's time. 
     
    
      607 
      O'MALLEY: Thanks a lot. 
      O'MALLEY 
      Thanks a lot. 
     
    
      608 
      HOLT: Secretary Clinton? 
      HOLT 
      Secretary Clinton? 
     
    
      609 
      CLINTON: Well Lester, I spent a lot of time la... 
      CLINTON 
      Well Lester, I spent a lot of time last week b... 
     
    
      610 
      He had request for help and he had basically s... 
      CLINTON 
      He had request for help and he had basically s... 
     
    
      611 
      So I sent my top campaign aide down there to t... 
      CLINTON 
      So I sent my top campaign aide down there to t... 
     
    
      612 
      HOLT: And that's time. 
      HOLT 
      And that's time. 
     
    
      613 
      CLINTON: I want to be a president who takes ca... 
      CLINTON 
      I want to be a president who takes care of the... 
     
    
      614 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
     
    
      615 
      HOLT: Thank you. 
      HOLT 
      Thank you. 
     
    
      616 
      Senator Sanders? 
      HOLT 
      Senator Sanders? 
     
    
      617 
      SANDERS: Well, Secretary Clinton was right and... 
      SANDERS 
      Well, Secretary Clinton was right and what I d... 
     
    
      618 
      Now, we are a great nation -- and we've heard ... 
      SANDERS 
      Now, we are a great nation -- and we've heard ... 
     
    
      619 
      We've got to get rid of Super PACs, we've got ... 
      SANDERS 
      We've got to get rid of Super PACs, we've got ... 
     
    
      620 
      HOLT: All right. Well thank you and thanks to ... 
      HOLT 
      All right. Well thank you and thanks to all of... 
     
    
      621 
      I also want to thank the Congressional Black C... 
      HOLT 
      I also want to thank the Congressional Black C... 
     
  
622 rows × 3 columns
In [12]:
    
applause_ds = dataset[dataset.speach == "(APPLAUSE)"]
len(applause_ds)
    
    Out[12]:
34
In [13]:
    
applause_ds.speaker.value_counts()
    
    Out[13]:
SANDERS     12
CLINTON     12
O'MALLEY     7
HOLT         3
Name: speaker, dtype: int64
In [14]:
    
applause_counts = applause_ds.speaker.value_counts().sort_values()
bottom = [index for index, item in enumerate(applause_counts.index)]
plt.barh(bottom, width=applause_counts, color="orange", linewidth=0)
y_labels = ["%s %.1f%%" % (item, 100.0*applause_counts[item]/len(applause_ds)) for index,item in enumerate(applause_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
applause_counts
    
    Out[14]:
HOLT         3
O'MALLEY     7
SANDERS     12
CLINTON     12
Name: speaker, dtype: int64
    
 
In [15]:
    
word_count = lambda x: len(re.findall("[A-Z]{2,}(?![a-z])|[A-Z][a-z]+(?=[A-Z])|[\'\w\-]+",x))
    
In [16]:
    
dataset["word_count"] = dataset.speach.apply(word_count)
dataset
    
    Out[16]:
  
    
       
      raw 
      speaker 
      speach 
      word_count 
     
  
  
    
      0 
      HOLT: We'll begin with 45 second opening state... 
      HOLT 
      We'll begin with 45 second opening statements ... 
      14 
     
    
      1 
      CLINTON: Well, good evening. And I want to tha... 
      CLINTON 
      Well, good evening. And I want to thank the Co... 
      31 
     
    
      2 
      You know, I remember well when my youth minist... 
      CLINTON 
      You know, I remember well when my youth minist... 
      67 
     
    
      3 
      And that is our fight still. We have to get th... 
      CLINTON 
      And that is our fight still. We have to get th... 
      50 
     
    
      4 
      I understand that this is the hardest job in t... 
      CLINTON 
      I understand that this is the hardest job in t... 
      42 
     
    
      5 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
     
    
      6 
      HOLT: Thank you. Senator Sanders, your opening... 
      HOLT 
      Thank you. Senator Sanders, your opening state... 
      8 
     
    
      7 
      SANDERS: Thank you. As we honor the extraordin... 
      SANDERS 
      Thank you. As we honor the extraordinary life ... 
      88 
     
    
      8 
      SANDERS: And then, to make a bad situation wor... 
      SANDERS 
      And then, to make a bad situation worse, we ha... 
      28 
     
    
      9 
      This campaign is about a political revolution ... 
      SANDERS 
      This campaign is about a political revolution ... 
      18 
     
    
      10 
      HOLT: Senator, thank you. 
      HOLT 
      Senator, thank you. 
      3 
     
    
      11 
      (APPLAUSE) 
      HOLT 
      (APPLAUSE) 
      1 
     
    
      12 
      And Governor O'Malley, your opening statement,... 
      HOLT 
      And Governor O'Malley, your opening statement,... 
      7 
     
    
      13 
      O'MALLEY: Thank you. My name is Martin O'Malle... 
      O'MALLEY 
      Thank you. My name is Martin O'Malley, I was b... 
      21 
     
    
      14 
      And I want to thank the people of South Caroli... 
      O'MALLEY 
      And I want to thank the people of South Caroli... 
      38 
     
    
      15 
      You taught us, in fact, in keeping with Dr. Ki... 
      O'MALLEY 
      You taught us, in fact, in keeping with Dr. Ki... 
      37 
     
    
      16 
      Eight years ago, you brought forward a new lea... 
      O'MALLEY 
      Eight years ago, you brought forward a new lea... 
      34 
     
    
      17 
      But in order to make good on the promise of eq... 
      O'MALLEY 
      But in order to make good on the promise of eq... 
      47 
     
    
      18 
      We need new leadership. We need to come togeth... 
      O'MALLEY 
      We need new leadership. We need to come togeth... 
      23 
     
    
      19 
      That's why I'm running for president. I need y... 
      O'MALLEY 
      That's why I'm running for president. I need y... 
      26 
     
    
      20 
      Thank you. 
      O'MALLEY 
      Thank you. 
      2 
     
    
      21 
      HOLT: All right. And Governor, thank you. 
      HOLT 
      All right. And Governor, thank you. 
      6 
     
    
      22 
      (APPLAUSE) 
      HOLT 
      (APPLAUSE) 
      1 
     
    
      23 
      HOLT: All right, to our first question, now. T... 
      HOLT 
      All right, to our first question, now. The fir... 
      18 
     
    
      24 
      President Obama came to office determined to s... 
      HOLT 
      President Obama came to office determined to s... 
      52 
     
    
      25 
      Senator Sanders. 
      HOLT 
      Senator Sanders. 
      2 
     
    
      26 
      SANDERS: Well, that's what our campaign is abo... 
      SANDERS 
      Well, that's what our campaign is about. It is... 
      68 
     
    
      27 
      So, what my first days are about is bringing A... 
      SANDERS 
      So, what my first days are about is bringing A... 
      61 
     
    
      28 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
      1 
     
    
      29 
      HOLT: Secretary Clinton, same question, my fir... 
      HOLT 
      Secretary Clinton, same question, my first 100... 
      16 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      592 
      HOLT: Welcome back everybody. Finally, before ... 
      HOLT 
      Welcome back everybody. Finally, before we go ... 
      48 
     
    
      593 
      And, we'll start with Governor O'Malley. 
      HOLT 
      And, we'll start with Governor O'Malley. 
      6 
     
    
      594 
      (LAUGHTER) 
      HOLT 
      (LAUGHTER) 
      1 
     
    
      595 
      HOLT: Didn't see that coming, did you? 
      HOLT 
      Didn't see that coming, did you? 
      6 
     
    
      596 
      O'MALLEY: Yes, but we're going to have to get ... 
      O'MALLEY 
      Yes, but we're going to have to get 20 minutes... 
      14 
     
    
      597 
      (LAUGHTER) 
      O'MALLEY 
      (LAUGHTER) 
      1 
     
    
      598 
      MITCHELL: ...too long (ph). 
      MITCHELL 
      ...too long (ph). 
      3 
     
    
      599 
      O'MALLEY: I believe there are many issues. I h... 
      O'MALLEY 
      I believe there are many issues. I have 60 sec... 
      12 
     
    
      600 
      HOLT: Sixty seconds, we'd appreciate it. 
      HOLT 
      Sixty seconds, we'd appreciate it. 
      5 
     
    
      601 
      O'MALLEY: There are so many issues that we hav... 
      O'MALLEY 
      There are so many issues that we haven't been ... 
      61 
     
    
      602 
      (APPLAUSE) 
      O'MALLEY 
      (APPLAUSE) 
      1 
     
    
      603 
      O'MALLEY: We haven't discussed the fact that i... 
      O'MALLEY 
      We haven't discussed the fact that in our hemi... 
      27 
     
    
      604 
      I guess the bottom line is this, look we are a... 
      O'MALLEY 
      I guess the bottom line is this, look we are a... 
      80 
     
    
      605 
      We're on the threshold of a new era of America... 
      O'MALLEY 
      We're on the threshold of a new era of America... 
      33 
     
    
      606 
      HOLT: And that's time. 
      HOLT 
      And that's time. 
      3 
     
    
      607 
      O'MALLEY: Thanks a lot. 
      O'MALLEY 
      Thanks a lot. 
      3 
     
    
      608 
      HOLT: Secretary Clinton? 
      HOLT 
      Secretary Clinton? 
      2 
     
    
      609 
      CLINTON: Well Lester, I spent a lot of time la... 
      CLINTON 
      Well Lester, I spent a lot of time last week b... 
      72 
     
    
      610 
      He had request for help and he had basically s... 
      CLINTON 
      He had request for help and he had basically s... 
      38 
     
    
      611 
      So I sent my top campaign aide down there to t... 
      CLINTON 
      So I sent my top campaign aide down there to t... 
      59 
     
    
      612 
      HOLT: And that's time. 
      HOLT 
      And that's time. 
      3 
     
    
      613 
      CLINTON: I want to be a president who takes ca... 
      CLINTON 
      I want to be a president who takes care of the... 
      25 
     
    
      614 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
     
    
      615 
      HOLT: Thank you. 
      HOLT 
      Thank you. 
      2 
     
    
      616 
      Senator Sanders? 
      HOLT 
      Senator Sanders? 
      2 
     
    
      617 
      SANDERS: Well, Secretary Clinton was right and... 
      SANDERS 
      Well, Secretary Clinton was right and what I d... 
      32 
     
    
      618 
      Now, we are a great nation -- and we've heard ... 
      SANDERS 
      Now, we are a great nation -- and we've heard ... 
      58 
     
    
      619 
      We've got to get rid of Super PACs, we've got ... 
      SANDERS 
      We've got to get rid of Super PACs, we've got ... 
      73 
     
    
      620 
      HOLT: All right. Well thank you and thanks to ... 
      HOLT 
      All right. Well thank you and thanks to all of... 
      28 
     
    
      621 
      I also want to thank the Congressional Black C... 
      HOLT 
      I also want to thank the Congressional Black C... 
      33 
     
  
622 rows × 4 columns
In [17]:
    
words_ds = dataset[dataset.speaker.isin(["CLINTON","SANDERS","O'MALLEY"])]
    
In [18]:
    
words_counts = words_ds.pivot_table(values="word_count", index="speaker", columns=None, aggfunc='mean',).sort_values()
bottom = [index for index, item in enumerate(words_counts.index)]
plt.barh(bottom, width=words_counts, color="orange", linewidth=0)
y_labels = ["%s %.1f words/paragraph" % (item, words_counts[item]) for index,item in enumerate(words_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
words_counts
    
    Out[18]:
speaker
O'MALLEY    25.115044
SANDERS     27.351190
CLINTON     32.656489
Name: word_count, dtype: float64
    
 
In [19]:
    
words_counts = words_ds.pivot_table(values="word_count", index="speaker", columns=None, aggfunc='sum',).sort_values()
bottom = [index for index, item in enumerate(words_counts.index)]
plt.barh(bottom, width=words_counts, color="orange", linewidth=0)
y_labels = ["%s %d (%.1f%%)" % (item, words_counts[item], 100.0*words_counts[item]/np.sum(words_counts)) for index,item in enumerate(words_counts.index)]
plt.yticks(np.array(bottom)+0.4, y_labels)
words_counts
    
    Out[19]:
speaker
O'MALLEY    2838
CLINTON     4278
SANDERS     4595
Name: word_count, dtype: int64
    
 
In [20]:
    
speaker_dict = {value:index for index,value in enumerate(words_ds.speaker.unique())}
speaker_dict
    
    Out[20]:
{'CLINTON': 0, "O'MALLEY": 2, 'SANDERS': 1}
In [21]:
    
words_ds["speaker_no"] = words_ds.speaker.map(speaker_dict)
words_ds
    
    
-c:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead
See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
    Out[21]:
  
    
       
      raw 
      speaker 
      speach 
      word_count 
      speaker_no 
     
  
  
    
      1 
      CLINTON: Well, good evening. And I want to tha... 
      CLINTON 
      Well, good evening. And I want to thank the Co... 
      31 
      0 
     
    
      2 
      You know, I remember well when my youth minist... 
      CLINTON 
      You know, I remember well when my youth minist... 
      67 
      0 
     
    
      3 
      And that is our fight still. We have to get th... 
      CLINTON 
      And that is our fight still. We have to get th... 
      50 
      0 
     
    
      4 
      I understand that this is the hardest job in t... 
      CLINTON 
      I understand that this is the hardest job in t... 
      42 
      0 
     
    
      5 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
      0 
     
    
      7 
      SANDERS: Thank you. As we honor the extraordin... 
      SANDERS 
      Thank you. As we honor the extraordinary life ... 
      88 
      1 
     
    
      8 
      SANDERS: And then, to make a bad situation wor... 
      SANDERS 
      And then, to make a bad situation worse, we ha... 
      28 
      1 
     
    
      9 
      This campaign is about a political revolution ... 
      SANDERS 
      This campaign is about a political revolution ... 
      18 
      1 
     
    
      13 
      O'MALLEY: Thank you. My name is Martin O'Malle... 
      O'MALLEY 
      Thank you. My name is Martin O'Malley, I was b... 
      21 
      2 
     
    
      14 
      And I want to thank the people of South Caroli... 
      O'MALLEY 
      And I want to thank the people of South Caroli... 
      38 
      2 
     
    
      15 
      You taught us, in fact, in keeping with Dr. Ki... 
      O'MALLEY 
      You taught us, in fact, in keeping with Dr. Ki... 
      37 
      2 
     
    
      16 
      Eight years ago, you brought forward a new lea... 
      O'MALLEY 
      Eight years ago, you brought forward a new lea... 
      34 
      2 
     
    
      17 
      But in order to make good on the promise of eq... 
      O'MALLEY 
      But in order to make good on the promise of eq... 
      47 
      2 
     
    
      18 
      We need new leadership. We need to come togeth... 
      O'MALLEY 
      We need new leadership. We need to come togeth... 
      23 
      2 
     
    
      19 
      That's why I'm running for president. I need y... 
      O'MALLEY 
      That's why I'm running for president. I need y... 
      26 
      2 
     
    
      20 
      Thank you. 
      O'MALLEY 
      Thank you. 
      2 
      2 
     
    
      26 
      SANDERS: Well, that's what our campaign is abo... 
      SANDERS 
      Well, that's what our campaign is about. It is... 
      68 
      1 
     
    
      27 
      So, what my first days are about is bringing A... 
      SANDERS 
      So, what my first days are about is bringing A... 
      61 
      1 
     
    
      28 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
      1 
      1 
     
    
      30 
      CLINTON: I would work quickly to present to th... 
      CLINTON 
      I would work quickly to present to the Congres... 
      35 
      0 
     
    
      31 
      I would also... 
      CLINTON 
      I would also... 
      3 
      0 
     
    
      32 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
      0 
     
    
      33 
      I would also be presenting my plans to build o... 
      CLINTON 
      I would also be presenting my plans to build o... 
      67 
      0 
     
    
      34 
      And third, I would be working, in every way th... 
      CLINTON 
      And third, I would be working, in every way th... 
      78 
      0 
     
    
      35 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
      0 
     
    
      37 
      O'MALLEY: Thank you. First of all, I would lay... 
      O'MALLEY 
      Thank you. First of all, I would lay out an ag... 
      77 
      2 
     
    
      38 
      Secondly, I believe the greatest business oppo... 
      O'MALLEY 
      Secondly, I believe the greatest business oppo... 
      48 
      2 
     
    
      39 
      (APPLAUSE) 
      O'MALLEY 
      (APPLAUSE) 
      1 
      2 
     
    
      41 
      O'MALLEY: Finally -- I'm sorry, that was secon... 
      O'MALLEY 
      Finally -- I'm sorry, that was second, Lester. 
      8 
      2 
     
    
      42 
      O'MALLEY: And third and finally, we need a new... 
      O'MALLEY 
      And third and finally, we need a new agenda fo... 
      77 
      2 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      573 
      We just have to do more of it, and we have to ... 
      CLINTON 
      We just have to do more of it, and we have to ... 
      32 
      0 
     
    
      576 
      SANDERS: Great ideas, Governor O'Malley, Secre... 
      SANDERS 
      Great ideas, Governor O'Malley, Secretary Clin... 
      27 
      1 
     
    
      577 
      So here's a promise that I make -- and I menti... 
      SANDERS 
      So here's a promise that I make -- and I menti... 
      43 
      1 
     
    
      578 
      Here's a promise. If elected president, Goldma... 
      SANDERS 
      Here's a promise. If elected president, Goldma... 
      24 
      1 
     
    
      579 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
      1 
      1 
     
    
      581 
      SANDERS: I was asked a question. You know, one... 
      SANDERS 
      I was asked a question. You know, one of the t... 
      58 
      1 
     
    
      582 
      I have avoided doing that. Trying to run an is... 
      SANDERS 
      I have avoided doing that. Trying to run an is... 
      11 
      1 
     
    
      583 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
      1 
      1 
     
    
      584 
      SANDERS: I was asked a question. 
      SANDERS 
      I was asked a question. 
      5 
      1 
     
    
      586 
      SANDERS: Well -- then if I don't answer it, th... 
      SANDERS 
      Well -- then if I don't answer it, then there'... 
      17 
      1 
     
    
      587 
      (LAUGHTER) 
      SANDERS 
      (LAUGHTER) 
      1 
      1 
     
    
      588 
      And I mean this seriously. You know that. We'v... 
      SANDERS 
      And I mean this seriously. You know that. We'v... 
      51 
      1 
     
    
      589 
      (APPLAUSE) 
      SANDERS 
      (APPLAUSE) 
      1 
      1 
     
    
      596 
      O'MALLEY: Yes, but we're going to have to get ... 
      O'MALLEY 
      Yes, but we're going to have to get 20 minutes... 
      14 
      2 
     
    
      597 
      (LAUGHTER) 
      O'MALLEY 
      (LAUGHTER) 
      1 
      2 
     
    
      599 
      O'MALLEY: I believe there are many issues. I h... 
      O'MALLEY 
      I believe there are many issues. I have 60 sec... 
      12 
      2 
     
    
      601 
      O'MALLEY: There are so many issues that we hav... 
      O'MALLEY 
      There are so many issues that we haven't been ... 
      61 
      2 
     
    
      602 
      (APPLAUSE) 
      O'MALLEY 
      (APPLAUSE) 
      1 
      2 
     
    
      603 
      O'MALLEY: We haven't discussed the fact that i... 
      O'MALLEY 
      We haven't discussed the fact that in our hemi... 
      27 
      2 
     
    
      604 
      I guess the bottom line is this, look we are a... 
      O'MALLEY 
      I guess the bottom line is this, look we are a... 
      80 
      2 
     
    
      605 
      We're on the threshold of a new era of America... 
      O'MALLEY 
      We're on the threshold of a new era of America... 
      33 
      2 
     
    
      607 
      O'MALLEY: Thanks a lot. 
      O'MALLEY 
      Thanks a lot. 
      3 
      2 
     
    
      609 
      CLINTON: Well Lester, I spent a lot of time la... 
      CLINTON 
      Well Lester, I spent a lot of time last week b... 
      72 
      0 
     
    
      610 
      He had request for help and he had basically s... 
      CLINTON 
      He had request for help and he had basically s... 
      38 
      0 
     
    
      611 
      So I sent my top campaign aide down there to t... 
      CLINTON 
      So I sent my top campaign aide down there to t... 
      59 
      0 
     
    
      613 
      CLINTON: I want to be a president who takes ca... 
      CLINTON 
      I want to be a president who takes care of the... 
      25 
      0 
     
    
      614 
      (APPLAUSE) 
      CLINTON 
      (APPLAUSE) 
      1 
      0 
     
    
      617 
      SANDERS: Well, Secretary Clinton was right and... 
      SANDERS 
      Well, Secretary Clinton was right and what I d... 
      32 
      1 
     
    
      618 
      Now, we are a great nation -- and we've heard ... 
      SANDERS 
      Now, we are a great nation -- and we've heard ... 
      58 
      1 
     
    
      619 
      We've got to get rid of Super PACs, we've got ... 
      SANDERS 
      We've got to get rid of Super PACs, we've got ... 
      73 
      1 
     
  
412 rows × 5 columns
In [22]:
    
cv = CountVectorizer()
count_matrix = cv.fit_transform(words_ds.speach)
count_matrix = count_matrix.toarray()
word_count = pd.DataFrame(cv.get_feature_names(), columns=["word"])
word_count["count"] = count_matrix.sum(axis=0)
word_count = word_count.sort_values(by="count", ascending=False).reset_index(drop=True)
word_count[:]
    
    Out[22]:
  
    
       
      word 
      count 
     
  
  
    
      0 
      the 
      546 
     
    
      1 
      to 
      414 
     
    
      2 
      and 
      362 
     
    
      3 
      of 
      293 
     
    
      4 
      we 
      280 
     
    
      5 
      that 
      274 
     
    
      6 
      in 
      223 
     
    
      7 
      is 
      157 
     
    
      8 
      have 
      144 
     
    
      9 
      it 
      126 
     
    
      10 
      on 
      117 
     
    
      11 
      you 
      104 
     
    
      12 
      what 
      102 
     
    
      13 
      for 
      102 
     
    
      14 
      people 
      86 
     
    
      15 
      our 
      85 
     
    
      16 
      with 
      82 
     
    
      17 
      this 
      78 
     
    
      18 
      do 
      71 
     
    
      19 
      not 
      69 
     
    
      20 
      but 
      62 
     
    
      21 
      as 
      61 
     
    
      22 
      are 
      58 
     
    
      23 
      be 
      55 
     
    
      24 
      can 
      51 
     
    
      25 
      applause 
      49 
     
    
      26 
      need 
      49 
     
    
      27 
      was 
      48 
     
    
      28 
      so 
      48 
     
    
      29 
      all 
      47 
     
    
      ... 
      ... 
      ... 
     
    
      1938 
      leaders 
      1 
     
    
      1939 
      boldly 
      1 
     
    
      1940 
      lay 
      1 
     
    
      1941 
      book 
      1 
     
    
      1942 
      border 
      1 
     
    
      1943 
      larger 
      1 
     
    
      1944 
      land 
      1 
     
    
      1945 
      lake 
      1 
     
    
      1946 
      laid 
      1 
     
    
      1947 
      lady 
      1 
     
    
      1948 
      legally 
      1 
     
    
      1949 
      less 
      1 
     
    
      1950 
      lost 
      1 
     
    
      1951 
      lesson 
      1 
     
    
      1952 
      loop 
      1 
     
    
      1953 
      biosurveillance 
      1 
     
    
      1954 
      living 
      1 
     
    
      1955 
      blame 
      1 
     
    
      1956 
      listened 
      1 
     
    
      1957 
      listen 
      1 
     
    
      1958 
      lindsey 
      1 
     
    
      1959 
      limit 
      1 
     
    
      1960 
      block 
      1 
     
    
      1961 
      lifting 
      1 
     
    
      1962 
      liable 
      1 
     
    
      1963 
      liability 
      1 
     
    
      1964 
      level 
      1 
     
    
      1965 
      lethal 
      1 
     
    
      1966 
      bloodiest 
      1 
     
    
      1967 
      zero 
      1 
     
  
1968 rows × 2 columns
In [23]:
    
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="SANDERS")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
    
    Out[23]:
  
    
       
      Vocab 
      Vocab_index 
      proba 
      anti_proba 
      difference 
     
  
  
    
      1064 
      major 
      1064 
      -9.080573 
      -6.454097 
      -2.626476 
     
    
      759 
      goldman 
      759 
      -9.080573 
      -6.677241 
      -2.403332 
     
    
      1507 
      sachs 
      1507 
      -9.080573 
      -6.677241 
      -2.403332 
     
    
      423 
      countries 
      423 
      -9.080573 
      -6.810772 
      -2.269801 
     
    
      406 
      contributions 
      406 
      -9.080573 
      -6.810772 
      -2.269801 
     
    
      418 
      corrupt 
      418 
      -9.080573 
      -6.810772 
      -2.269801 
     
    
      1627 
      spending 
      1627 
      -9.080573 
      -6.964923 
      -2.115650 
     
    
      381 
      companies 
      381 
      -9.080573 
      -6.964923 
      -2.115650 
     
    
      1411 
      real 
      1411 
      -9.080573 
      -6.964923 
      -2.115650 
     
    
      1852 
      vermont 
      1852 
      -9.080573 
      -6.964923 
      -2.115650 
     
    
      407 
      contributors 
      407 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1480 
      revolution 
      1480 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1070 
      man 
      1070 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1481 
      rhetoric 
      1481 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1776 
      transform 
      1776 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1672 
      super 
      1672 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1484 
      rid 
      1484 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1782 
      treasury 
      1782 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      1576 
      she 
      1576 
      -9.080573 
      -7.147245 
      -1.933329 
     
    
      447 
      crumbling 
      447 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      997 
      latino 
      997 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      795 
      hampshire 
      795 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      1712 
      terms 
      1712 
      -8.387426 
      -6.677241 
      -1.710185 
     
    
      1675 
      supported 
      1675 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      1346 
      private 
      1346 
      -8.387426 
      -6.677241 
      -1.710185 
     
    
      1343 
      priority 
      1343 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      995 
      largest 
      995 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      151 
      area 
      151 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      133 
      anti 
      133 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      1238 
      pacs 
      1238 
      -9.080573 
      -7.370388 
      -1.710185 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      128 
      andrea 
      128 
      -6.307985 
      -7.658070 
      1.350086 
     
    
      1719 
      thank 
      1719 
      -6.682678 
      -8.063535 
      1.380857 
     
    
      718 
      frank 
      718 
      -6.682678 
      -8.063535 
      1.380857 
     
    
      23 
      30 
      23 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      481 
      defend 
      481 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      1084 
      matter 
      1084 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      1799 
      try 
      1799 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      809 
      haven 
      809 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      918 
      intelligence 
      918 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      86 
      age 
      86 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      621 
      equal 
      621 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      1874 
      visiting 
      1874 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      156 
      around 
      156 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      562 
      door 
      562 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      707 
      forces 
      707 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      1842 
      use 
      1842 
      -7.288814 
      -8.756682 
      1.467869 
     
    
      1957 
      year 
      1957 
      -6.515624 
      -8.063535 
      1.547911 
     
    
      972 
      keep 
      972 
      -7.134663 
      -8.756682 
      1.622019 
     
    
      273 
      build 
      273 
      -7.134663 
      -8.756682 
      1.622019 
     
    
      1634 
      stage 
      1634 
      -7.001132 
      -8.756682 
      1.755551 
     
    
      59 
      actually 
      59 
      -6.307985 
      -8.063535 
      1.755551 
     
    
      169 
      attacks 
      169 
      -7.001132 
      -8.756682 
      1.755551 
     
    
      420 
      costs 
      420 
      -7.001132 
      -8.756682 
      1.755551 
     
    
      791 
      had 
      791 
      -6.307985 
      -8.063535 
      1.755551 
     
    
      365 
      come 
      365 
      -7.001132 
      -8.756682 
      1.755551 
     
    
      1589 
      since 
      1589 
      -6.777988 
      -8.756682 
      1.978694 
     
    
      553 
      dodd 
      553 
      -6.777988 
      -8.756682 
      1.978694 
     
    
      84 
      again 
      84 
      -6.777988 
      -8.756682 
      1.978694 
     
    
      1281 
      plan 
      1281 
      -6.682678 
      -8.756682 
      2.074004 
     
    
      1021 
      lester 
      1021 
      -6.682678 
      -8.756682 
      2.074004 
     
  
1968 rows × 5 columns
In [24]:
    
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="CLINTON")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
    
    Out[24]:
  
    
       
      Vocab 
      Vocab_index 
      proba 
      anti_proba 
      difference 
     
  
  
    
      553 
      dodd 
      553 
      -9.111183 
      -6.410175 
      -2.701008 
     
    
      1799 
      try 
      1799 
      -9.111183 
      -6.921001 
      -2.190182 
     
    
      718 
      frank 
      718 
      -8.418036 
      -6.314865 
      -2.103171 
     
    
      1925 
      white 
      1925 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      1287 
      pleased 
      1287 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      1608 
      someone 
      1608 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      1628 
      spent 
      1628 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      625 
      especially 
      625 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      927 
      introduced 
      927 
      -9.111183 
      -7.103322 
      -2.007861 
     
    
      1487 
      rights 
      1487 
      -8.418036 
      -6.515535 
      -1.902500 
     
    
      1516 
      sanctions 
      1516 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      1349 
      problem 
      1349 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      895 
      incomes 
      895 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      650 
      experience 
      650 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      370 
      comments 
      370 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      1912 
      week 
      1912 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      679 
      fighters 
      679 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      147 
      approach 
      147 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      1544 
      sector 
      1544 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      1563 
      serious 
      1563 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      16 
      2011 
      16 
      -9.111183 
      -7.326466 
      -1.784717 
     
    
      791 
      had 
      791 
      -7.724888 
      -6.073703 
      -1.651186 
     
    
      420 
      costs 
      420 
      -8.418036 
      -6.766850 
      -1.651186 
     
    
      1877 
      voted 
      1877 
      -8.012571 
      -6.515535 
      -1.497035 
     
    
      1639 
      standing 
      1639 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      1786 
      treaty 
      1786 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      55 
      acted 
      55 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      699 
      flint 
      699 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      1878 
      votes 
      1878 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      1041 
      lone 
      1041 
      -9.111183 
      -7.614148 
      -1.497035 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      1274 
      ph 
      1274 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      1371 
      provide 
      1371 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      1852 
      vermont 
      1852 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      7 
      15 
      7 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      809 
      haven 
      809 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      23 
      30 
      23 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      1935 
      without 
      1935 
      -7.319423 
      -8.712760 
      1.393337 
     
    
      711 
      forward 
      711 
      -6.546233 
      -8.019613 
      1.473379 
     
    
      1540 
      seconds 
      1540 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      632 
      ever 
      632 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      729 
      front 
      729 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      271 
      budget 
      271 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      1792 
      true 
      1792 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      418 
      corrupt 
      418 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      1271 
      person 
      1271 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      1267 
      percent 
      1267 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      406 
      contributions 
      406 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      561 
      done 
      561 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      952 
      issues 
      952 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      423 
      countries 
      423 
      -7.165273 
      -8.712760 
      1.547487 
     
    
      1537 
      second 
      1537 
      -7.031741 
      -8.712760 
      1.681019 
     
    
      1542 
      secretary 
      1542 
      -5.645447 
      -7.326466 
      1.681019 
     
    
      1507 
      sachs 
      1507 
      -7.031741 
      -8.712760 
      1.681019 
     
    
      759 
      goldman 
      759 
      -7.031741 
      -8.712760 
      1.681019 
     
    
      128 
      andrea 
      128 
      -6.277969 
      -8.019613 
      1.741643 
     
    
      1731 
      things 
      1731 
      -6.220811 
      -8.019613 
      1.798802 
     
    
      1712 
      terms 
      1712 
      -6.913958 
      -8.712760 
      1.798802 
     
    
      1064 
      major 
      1064 
      -6.808598 
      -8.712760 
      1.904162 
     
    
      1728 
      these 
      1728 
      -6.472125 
      -8.712760 
      2.240635 
     
    
      350 
      clinton 
      350 
      -6.066660 
      -8.712760 
      2.646100 
     
  
1968 rows × 5 columns
In [25]:
    
cl = MultinomialNB()
cl.fit(count_matrix, words_ds.speaker=="O'MALLY")
df_vocab = pd.DataFrame(list(cv.vocabulary_.keys()), columns=["Vocab"])
df_vocab["Vocab_index"] = cv.vocabulary_.values()
df_vocab = df_vocab.sort_values("Vocab_index").reset_index(drop=True)
df_vocab["proba"] = cl.feature_log_prob_[0]
df_vocab["anti_proba"] = cl.feature_log_prob_[1]
df_vocab["difference"] = cl.feature_log_prob_[0] - cl.feature_log_prob_[1]
df_vocab.sort_values("difference", ascending=True)
    
    Out[25]:
  
    
       
      Vocab 
      Vocab_index 
      proba 
      anti_proba 
      difference 
     
  
  
    
      983 
      knew 
      983 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1129 
      momentum 
      1129 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1127 
      mom 
      1127 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1124 
      mixed 
      1124 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1122 
      mission 
      1122 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1121 
      missing 
      1121 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1120 
      minutes 
      1120 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1119 
      minus 
      1119 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1131 
      months 
      1131 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1116 
      mindful 
      1116 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1111 
      militarize 
      1111 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1109 
      midst 
      1109 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1107 
      michigan 
      1107 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1105 
      message 
      1105 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1103 
      mention 
      1103 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1102 
      mentally 
      1102 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1099 
      memphis 
      1099 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1114 
      millionaires 
      1114 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1132 
      moon 
      1132 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1133 
      moral 
      1133 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1140 
      moving 
      1140 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1173 
      nothing 
      1173 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1171 
      normalize 
      1171 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1170 
      normalization 
      1170 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1169 
      nonsense 
      1169 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1165 
      nightmare 
      1165 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1164 
      nickel 
      1164 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1160 
      neither 
      1160 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1159 
      neighbors 
      1159 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      1158 
      negotiating 
      1158 
      -8.792398 
      -7.584773 
      -1.207625 
     
    
      ... 
      ... 
      ... 
      ... 
      ... 
      ... 
     
    
      103 
      all 
      103 
      -5.614344 
      -7.584773 
      1.970429 
     
    
      1602 
      so 
      1602 
      -5.593725 
      -7.584773 
      1.991048 
     
    
      1897 
      was 
      1897 
      -5.593725 
      -7.584773 
      1.991048 
     
    
      144 
      applause 
      144 
      -5.573522 
      -7.584773 
      2.011251 
     
    
      1154 
      need 
      1154 
      -5.573522 
      -7.584773 
      2.011251 
     
    
      293 
      can 
      293 
      -5.534301 
      -7.584773 
      2.050472 
     
    
      202 
      be 
      202 
      -5.460193 
      -7.584773 
      2.124580 
     
    
      150 
      are 
      150 
      -5.408007 
      -7.584773 
      2.176766 
     
    
      158 
      as 
      158 
      -5.358411 
      -7.584773 
      2.226363 
     
    
      284 
      but 
      284 
      -5.342410 
      -7.584773 
      2.242363 
     
    
      1172 
      not 
      1172 
      -5.237050 
      -7.584773 
      2.347723 
     
    
      551 
      do 
      551 
      -5.208879 
      -7.584773 
      2.375894 
     
    
      1737 
      this 
      1737 
      -5.116097 
      -7.584773 
      2.468676 
     
    
      1933 
      with 
      1933 
      -5.066704 
      -7.584773 
      2.518069 
     
    
      1223 
      our 
      1223 
      -5.031198 
      -7.584773 
      2.553575 
     
    
      1265 
      people 
      1265 
      -5.019637 
      -7.584773 
      2.565136 
     
    
      705 
      for 
      705 
      -4.850816 
      -7.584773 
      2.733957 
     
    
      1917 
      what 
      1917 
      -4.850816 
      -7.584773 
      2.733957 
     
    
      1961 
      you 
      1961 
      -4.831585 
      -7.584773 
      2.753189 
     
    
      1198 
      on 
      1198 
      -4.714860 
      -7.584773 
      2.869913 
     
    
      953 
      it 
      953 
      -4.641358 
      -7.584773 
      2.943415 
     
    
      808 
      have 
      808 
      -4.508811 
      -7.584773 
      3.075962 
     
    
      943 
      is 
      943 
      -4.422950 
      -7.584773 
      3.161823 
     
    
      886 
      in 
      886 
      -4.073899 
      -7.584773 
      3.510874 
     
    
      1721 
      that 
      1721 
      -3.868774 
      -7.584773 
      3.715999 
     
    
      1903 
      we 
      1903 
      -3.847190 
      -7.584773 
      3.737583 
     
    
      1186 
      of 
      1186 
      -3.801965 
      -7.584773 
      3.782808 
     
    
      127 
      and 
      127 
      -3.591142 
      -7.584773 
      3.993631 
     
    
      1751 
      to 
      1751 
      -3.457266 
      -7.584773 
      4.127507 
     
    
      1722 
      the 
      1722 
      -3.181096 
      -7.584773 
      4.403677 
     
  
1968 rows × 5 columns
In [ ]:
    
    
Content source: TwistedHardware/mltutorial
Similar notebooks: